# Import python libraries
from pymongo import MongoClient
from pprint import pprint
from bson import json_util
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import plotly.express as px
import plotly.graph_objects as go
import opendatasets as od
# Download CSV file from Kaggle into Jupyter Notebook
od.download("https://www.kaggle.com/datasets/hasibalmuzdadid/global-air-pollution-dataset")
Skipping, found downloaded files in ".\global-air-pollution-dataset" (use force=True to force download)
# Reading the CSV file
file =('global-air-pollution-dataset/\global air pollution dataset.csv')
df = pd.read_csv(file)
df
| Country | City | AQI Value | AQI Category | CO AQI Value | CO AQI Category | Ozone AQI Value | Ozone AQI Category | NO2 AQI Value | NO2 AQI Category | PM2.5 AQI Value | PM2.5 AQI Category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Russian Federation | Praskoveya | 51 | Moderate | 1 | Good | 36 | Good | 0 | Good | 51 | Moderate |
| 1 | Brazil | Presidente Dutra | 41 | Good | 1 | Good | 5 | Good | 1 | Good | 41 | Good |
| 2 | Italy | Priolo Gargallo | 66 | Moderate | 1 | Good | 39 | Good | 2 | Good | 66 | Moderate |
| 3 | Poland | Przasnysz | 34 | Good | 1 | Good | 34 | Good | 0 | Good | 20 | Good |
| 4 | France | Punaauia | 22 | Good | 0 | Good | 22 | Good | 0 | Good | 6 | Good |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 23458 | India | Gursahaiganj | 184 | Unhealthy | 3 | Good | 154 | Unhealthy | 2 | Good | 184 | Unhealthy |
| 23459 | France | Sceaux | 50 | Good | 1 | Good | 20 | Good | 5 | Good | 50 | Good |
| 23460 | India | Mormugao | 50 | Good | 1 | Good | 22 | Good | 1 | Good | 50 | Good |
| 23461 | United States of America | Westerville | 71 | Moderate | 1 | Good | 44 | Good | 2 | Good | 71 | Moderate |
| 23462 | Malaysia | Marang | 70 | Moderate | 1 | Good | 38 | Good | 0 | Good | 70 | Moderate |
23463 rows × 12 columns
# Print the information about a DataFrame
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 23463 entries, 0 to 23462 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Country 23036 non-null object 1 City 23462 non-null object 2 AQI Value 23463 non-null int64 3 AQI Category 23463 non-null object 4 CO AQI Value 23463 non-null int64 5 CO AQI Category 23463 non-null object 6 Ozone AQI Value 23463 non-null int64 7 Ozone AQI Category 23463 non-null object 8 NO2 AQI Value 23463 non-null int64 9 NO2 AQI Category 23463 non-null object 10 PM2.5 AQI Value 23463 non-null int64 11 PM2.5 AQI Category 23463 non-null object dtypes: int64(5), object(7) memory usage: 2.1+ MB
#Rename columns in DataFrame
df.rename(columns =
{'AQI Value ':'AQIvalue ',
'AQI Category ':'AQIcategory ',
'CO AQI Value':'COvalue ',
'CO AQI Category':'COcategory',
'Ozone AQI Value':'OZONEvalue',
'Ozone AQI Category':'OZONEcategory',
'NO2 AQI Value':'NO2value',
'NO2 AQI Category':'NO2category',
'PM2.5 AQI Value':'PM2-5value',
'PM2.5 AQI Category':'PM2-5category'}
, inplace = True)
#Visualize missing values (NaN) values using Missingno Library
msno.matrix(df)
plt.show()
# Get a sum of null values from each column
df.isna().sum()
Country 427 City 1 AQI Value 0 AQI Category 0 COvalue 0 COcategory 0 OZONEvalue 0 OZONEcategory 0 NO2value 0 NO2category 0 PM2-5value 0 PM2-5category 0 dtype: int64
# Remove the rows that contains NULL values.
df = df.dropna()
#Recheck Null value after using "dropna()"
msno.matrix(df)
plt.show()
# Recheck Null value after using "dropna()"
df.isna().sum()
Country 0 City 0 AQI Value 0 AQI Category 0 COvalue 0 COcategory 0 OZONEvalue 0 OZONEcategory 0 NO2value 0 NO2category 0 PM2-5value 0 PM2-5category 0 dtype: int64
# Analyze duplicate values
duplicate = df[df.duplicated()]
print("Duplicate Rows : ", duplicate)
Duplicate Rows : Empty DataFrame Columns: [Country, City, AQI Value, AQI Category, COvalue , COcategory, OZONEvalue, OZONEcategory, NO2value, NO2category, PM2-5value, PM2-5category] Index: []
# Generate descriptive statistics
describe = df.describe()
describe.round(2)
| AQI Value | COvalue | OZONEvalue | NO2value | PM2-5value | |
|---|---|---|---|---|---|
| count | 23035.00 | 23035.00 | 23035.00 | 23035.00 | 23035.00 |
| mean | 72.34 | 1.38 | 35.23 | 3.08 | 68.88 |
| std | 56.36 | 1.84 | 28.24 | 5.28 | 55.06 |
| min | 6.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 39.00 | 1.00 | 21.00 | 0.00 | 35.00 |
| 50% | 55.00 | 1.00 | 31.00 | 1.00 | 54.00 |
| 75% | 80.00 | 1.00 | 40.00 | 4.00 | 79.00 |
| max | 500.00 | 133.00 | 235.00 | 91.00 | 500.00 |
# Set up a connection to the MongoDB cluster
client = MongoClient("mongodb+srv://wunchana:golf@cluster0.ly9xn5f.mongodb.net/test")
# Create a database name 'global_air_pollution'
db = client['global_air_pollution']
# Create a collection name 'air_pollution' in 'global_air_pollution' database
collection = db['air_pollution']
# Convert the DataFrame to a list of dictionaries
air_data = df.to_dict('records')
# Inserts multiple documents into a collection
collection.insert_many(air_data)
<pymongo.results.InsertManyResult at 0x259ac9fd3a0>
#Check server status
serverStatusResult=db.command("serverStatus")
serverStatusResult
{'host': 'ac-6ptj635-shard-00-01.ly9xn5f.mongodb.net:27017',
'version': '6.0.5',
'process': 'mongod',
'pid': 2977,
'uptime': 420292.0,
'uptimeMillis': 420291591,
'uptimeEstimate': 420291,
'localTime': datetime.datetime(2023, 4, 30, 16, 54, 30, 196000),
'asserts': {'regular': 0, 'warning': 0, 'msg': 0, 'user': 0, 'rollovers': 0},
'connections': {'current': 16, 'available': 484, 'totalCreated': 450},
'extra_info': {'note': 'fields vary by platform', 'page_faults': 0},
'network': {'bytesIn': 117001412,
'bytesOut': 308293423,
'numRequests': 58427},
'opcounters': {'insert': 415486,
'query': 148,
'update': 0,
'delete': 0,
'getmore': 152,
'command': 58104,
'deprecated': {'query': 0, 'getmore': 0}},
'opcountersRepl': {'insert': 0,
'query': 0,
'update': 0,
'delete': 0,
'getmore': 0,
'command': 0,
'deprecated': {'query': 0, 'getmore': 0}},
'repl': {'topologyVersion': {'processId': ObjectId('644833826dde9f662f72a4c6'),
'counter': 6},
'hosts': ['ac-6ptj635-shard-00-00.ly9xn5f.mongodb.net:27017',
'ac-6ptj635-shard-00-01.ly9xn5f.mongodb.net:27017',
'ac-6ptj635-shard-00-02.ly9xn5f.mongodb.net:27017'],
'setName': 'atlas-rvizeg-shard-0',
'setVersion': 10,
'isWritablePrimary': True,
'secondary': False,
'primary': 'ac-6ptj635-shard-00-01.ly9xn5f.mongodb.net:27017',
'tags': {'region': 'US_EAST_1',
'workloadType': 'OPERATIONAL',
'provider': 'AWS',
'nodeType': 'ELECTABLE'},
'me': 'ac-6ptj635-shard-00-01.ly9xn5f.mongodb.net:27017',
'electionId': ObjectId('7fffffff00000000000000cf'),
'lastWrite': {'opTime': {'ts': Timestamp(1682873670, 4), 't': 207},
'lastWriteDate': datetime.datetime(2023, 4, 30, 16, 54, 30),
'majorityOpTime': {'ts': Timestamp(1682873670, 4), 't': 207},
'majorityWriteDate': datetime.datetime(2023, 4, 30, 16, 54, 30)},
'primaryOnlyServices': {'TenantMigrationRecipientService': {'state': 'running',
'numInstances': 0},
'TenantMigrationDonorService': {'state': 'running', 'numInstances': 0},
'ShardSplitDonorService': {'state': 'running', 'numInstances': 0}},
'rbid': 3,
'userWriteBlockMode': 1},
'storageEngine': {'name': 'wiredTiger',
'supportsCommittedReads': True,
'oldestRequiredTimestampForCrashRecovery': Timestamp(1682873639, 25),
'supportsPendingDrops': True,
'dropPendingIdents': 2,
'supportsSnapshotReadConcern': True,
'readOnly': False,
'persistent': True,
'backupCursorOpen': False},
'mem': {'bits': 64,
'resident': 0,
'virtual': 0,
'supported': True,
'mapped': 0,
'mappedWithJournal': 0},
'metrics': {'aggStageCounters': {'search': 0,
'searchBeta': 0,
'searchMeta': 0},
'operatorCounters': {'match': {'text': 0, 'regex': 0}},
'atlas': {'connectionPool': {'totalCreated': 124212}}},
'ok': 1.0,
'$clusterTime': {'clusterTime': Timestamp(1682873670, 4),
'signature': {'hash': b'\x7f\\\xd4W\xbe\xb2D\x94Ru#.O(*\xc1\x016\x03^',
'keyId': 7193436090353582082}},
'operationTime': Timestamp(1682873670, 4),
'opLatencies': {'reads': {'latency': 14101021, 'ops': 598},
'writes': {'latency': 118224344, 'ops': 18},
'commands': {'latency': 469106712034, 'ops': 57796}},
'atlasVersion': {'version': '20230412.0.0.1681399681',
'gitVersion': '28f3fc529d91bbe3b73f9abc34cce793ca7d3fa8'}}
# Check the list of collection present in the global_air_pollution
print(db.list_collection_names())
['air_pollution']
# Count the number of documents in the collection
num_document = collection.count_documents({})
print(num_document)
23035
df = db.air_pollution.find({})
df = pd.DataFrame(list(df))
df.head(5)
| _id | Country | City | AQI Value | AQI Category | COvalue | COcategory | OZONEvalue | OZONEcategory | NO2value | NO2category | PM2-5value | PM2-5category | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 644e9d3add11d9e1f4df6fb4 | Russian Federation | Praskoveya | 51 | Moderate | 1 | Good | 36 | Good | 0 | Good | 51 | Moderate |
| 1 | 644e9d3add11d9e1f4df6fb5 | Brazil | Presidente Dutra | 41 | Good | 1 | Good | 5 | Good | 1 | Good | 41 | Good |
| 2 | 644e9d3add11d9e1f4df6fb6 | Italy | Priolo Gargallo | 66 | Moderate | 1 | Good | 39 | Good | 2 | Good | 66 | Moderate |
| 3 | 644e9d3add11d9e1f4df6fb7 | Poland | Przasnysz | 34 | Good | 1 | Good | 34 | Good | 0 | Good | 20 | Good |
| 4 | 644e9d3add11d9e1f4df6fb8 | France | Punaauia | 22 | Good | 0 | Good | 22 | Good | 0 | Good | 6 | Good |
# Print example of document in the collection
sample_document = collection.find_one()
sample_document
{'_id': ObjectId('644e9d3add11d9e1f4df6fb4'),
'Country': 'Russian Federation',
'City': 'Praskoveya',
'AQI Value': 51,
'AQI Category': 'Moderate',
'COvalue ': 1,
'COcategory': 'Good',
'OZONEvalue': 36,
'OZONEcategory': 'Good',
'NO2value': 0,
'NO2category': 'Good',
'PM2-5value': 51,
'PM2-5category': 'Moderate'}
# Determine unique counties in the dataset
unique_countries = len(collection.distinct('Country'))
print(unique_countries)
175
# Determine unique cities in the dataset
unique_cities = len(collection.distinct('City'))
print(unique_cities)
23035
# Find the top 100 cities with the highest PM2.5 values
pipeline = [
{'$sort': {'PM2-5value': -1}},
{'$limit': 100},
{'$project': {'_id': 0, 'City': 1, 'Country': 1, 'PM2-5value': 1}}
]
df = pd.DataFrame(list(collection.aggregate(pipeline)))
df
| Country | City | PM2-5value | |
|---|---|---|---|
| 0 | United States of America | Durango | 500 |
| 1 | India | Sardulgarh | 500 |
| 2 | India | Nilokheri | 500 |
| 3 | India | Moradabad | 500 |
| 4 | India | Churu | 500 |
| ... | ... | ... | ... |
| 95 | India | Khairabad | 403 |
| 96 | India | Doraha | 400 |
| 97 | India | Lachhmangarh | 400 |
| 98 | India | Balotra | 398 |
| 99 | India | Ujhani | 396 |
100 rows × 3 columns
#Calculate the average AQI value for each country and plot into horizontal bar chart
pipeline = [{'$group': {'_id': '$Country',
'avg_AQIvalue': {'$avg': '$AQI Value'},
'stddev_AQIvalue': {'$stdDevPop': '$AQI Value'}}},
{'$sort': {'avg_AQIvalue': 1}}]
avgAQI = pd.DataFrame(list(collection.aggregate(pipeline)))
# Print top 5 countries with highest average AQI values
avgAQI_sorted = avgAQI.sort_values(by=['avg_AQIvalue'], ascending=False)
avgAQI_sorted.head(5)
| _id | avg_AQIvalue | stddev_AQIvalue | |
|---|---|---|---|
| 174 | Republic of Korea | 421.000000 | 0.000000 |
| 173 | Bahrain | 188.000000 | 0.000000 |
| 172 | Mauritania | 179.000000 | 26.353368 |
| 171 | Pakistan | 178.788274 | 61.108790 |
| 170 | United Arab Emirates | 163.666667 | 8.259674 |
# Plot a horizontal bar chart of average AQI using Plotly
fig = go.Figure()
fig.add_trace(go.Bar(x = avgAQI['avg_AQIvalue'], y = avgAQI['_id'], orientation = 'h'))
fig.update_layout(title = {
'text': '<b>Countries ranked by average AQI values<b>',
'x': 0.5,
'y': 0.95},
xaxis_title = '<b>Average AQI values<b>',
yaxis_title = '<b>Country<b>',
width = 800, height = 600)
fig.show()
#Calculate the average CO values in each country and plot data into horizontal bar chart
pipeline = [{'$group': {'_id': '$Country',
'avg_CO_value': {'$avg': '$COvalue '},
'stddev_CO_value': {'$stdDevPop': '$COvalue '}}},
{'$sort': {'avg_CO_value': 1}}]
avgCO = pd.DataFrame(list(collection.aggregate(pipeline)))
# Print top 5 countries with highest average CO values
avgCO_sorted = avgCO.sort_values(by=['avg_CO_value'], ascending=False)
avgCO_sorted.head(5)
| _id | avg_CO_value | stddev_CO_value | |
|---|---|---|---|
| 174 | Republic of Korea | 27.000000 | 0.000000 |
| 173 | South Africa | 5.379310 | 7.238533 |
| 172 | Democratic Republic of the Congo | 5.285714 | 8.083745 |
| 171 | Kingdom of Eswatini | 4.666667 | 3.091206 |
| 170 | Nigeria | 3.812500 | 2.710496 |
# Plot a horizontal bar chart of average CO usig Plotly
fig = go.Figure()
fig.add_trace(go.Bar(
x = avgCO['avg_CO_value'],
y = avgCO['_id'],
orientation = 'h'))
fig.update_layout(title = {
'text': '<b>Countries ranked by average CO values<b>',
'x': 0.5,
'y': 0.95},
xaxis_title = '<b>Average CO values<b>',
yaxis_title = '<b>Country<b>',
width = 800, height = 600)
fig.show()
#Calculate the average Ozone values in each country and plot data into horizontal bar chart
pipeline = [{'$group': {'_id': '$Country',
'avg_OZONE_value': {'$avg': '$OZONEvalue'},
'stddev_OZONE_value': {'$stdDevPop': '$OZONEvalue'}}},
{'$sort': {'avg_OZONE_value': 1}}]
avgOZONE = pd.DataFrame(list(collection.aggregate(pipeline)))
# Print top 5 countries with highest average Ozone values
avgOZONE_sorted = avgOZONE.sort_values(by=['avg_OZONE_value'], ascending=False)
avgOZONE_sorted.head(5)
| _id | avg_OZONE_value | stddev_OZONE_value | |
|---|---|---|---|
| 174 | United Arab Emirates | 159.666667 | 13.912425 |
| 173 | Qatar | 139.000000 | 25.000000 |
| 172 | Kuwait | 135.666667 | 16.499158 |
| 171 | Bahrain | 127.000000 | 0.000000 |
| 170 | Oman | 97.833333 | 40.736620 |
# Plot a horizontal bar chart of average Ozone usig Plotly
fig = go.Figure()
fig.add_trace(go.Bar(
x = avgOZONE['avg_OZONE_value'],
y = avgOZONE['_id'],
orientation = 'h'))
fig.update_layout(title = {
'text': '<b>Countries ranked by average Ozone values<b>',
'x': 0.5,
'y': 0.95},
xaxis_title = '<b>Average Ozone values<b>',
yaxis_title = '<b>Country<b>',
width = 800, height = 600)
fig.show()
#Calculate the average NO2 values in each country and plot data into horizontal bar chart
pipeline = [{'$group': {'_id': '$Country',
'avg_NO2_value': {'$avg': '$NO2value'},
'stddev_NO2_value': {'$stdDevPop': '$NO2value'}}},
{'$sort': {'avg_NO2_value': 1}}]
avgNO2 = pd.DataFrame(list(collection.aggregate(pipeline)))
# Print top 5 countries with highest average NO2 values
avgNO2_sorted = avgNO2.sort_values(by=['avg_NO2_value'], ascending=False)
avgNO2_sorted.head(5)
| _id | avg_NO2_value | stddev_NO2_value | |
|---|---|---|---|
| 174 | Republic of Korea | 91.000000 | 0.000000 |
| 173 | Kuwait | 13.666667 | 6.128259 |
| 172 | El Salvador | 9.393939 | 4.532304 |
| 171 | Algeria | 9.333333 | 18.966344 |
| 170 | Chile | 8.592593 | 8.067275 |
# Plot a horizontal bar chart of average NO2 usig Plotly
fig = go.Figure()
fig.add_trace(go.Bar(
x = avgNO2['avg_NO2_value'],
y = avgNO2['_id'],
orientation = 'h'))
fig.update_layout(title = {
'text': '<b>Countries ranked by average NO2 values<b>',
'x': 0.5,
'y': 0.95},
xaxis_title = '<b>Average NO2 values<b>',
yaxis_title = '<b>Country<b>',
width = 800, height = 600)
fig.show()
#Calculate the average PM2.5 value in each country and plot data into horizontal bar chart
pipeline = [{'$group': {'_id': '$Country',
'avg_PM2-5value': {'$avg': '$PM2-5value'},
'stddev_PM2-5value': {'$stdDevPop': '$PM2-5value'}}},
{'$sort': {'avg_PM2-5value': 1}}]
avgPM25 = pd.DataFrame(list(collection.aggregate(pipeline)))
# Print top 5 countries with highest average PM2.5 values
avgPM25_sorted = avgPM25.sort_values(by=['avg_PM2-5value'], ascending=False)
avgPM25_sorted.head(5)
| _id | avg_PM2-5value | stddev_PM2-5value | |
|---|---|---|---|
| 174 | Republic of Korea | 415.000000 | 0.000000 |
| 173 | Bahrain | 188.000000 | 0.000000 |
| 172 | Mauritania | 179.000000 | 26.353368 |
| 171 | Pakistan | 173.110749 | 53.404554 |
| 170 | Aruba | 163.000000 | 0.000000 |
# Plot a horizontal bar chart of average PM2.5 usig Plotly
fig = go.Figure()
fig.add_trace(go.Bar(
x=avgPM25['avg_PM2-5value'],
y=avgPM25['_id'],
orientation='h'))
fig.update_layout(title={
'text': '<b>Countries ranked by average PM2.5 values<b>',
'x': 0.5,
'y': 0.95},
xaxis_title='<b>Average PM2.5 values<b>',
yaxis_title='<b>Country<b>',
width=800, height=600)
fig.show()
# The distribution of the AQI category in all cities worldwide using pie chart
pipeline = [{"$group": {"_id": "$AQI Category", "count": {"$sum": 1}}}]
AQI_categories = pd.DataFrame(list(collection.aggregate(pipeline)))
fig = px.pie(AQI_categories,
values = 'count',
names = '_id',
color_discrete_sequence = px.colors.qualitative.Plotly)
fig.update_layout(
title = '<b>Distribution of AQI Categories Worldwide<b>',
title_x = 0.15,
font = dict(size = 12)
)
fig.show()
# Print the distribution of the AQI category in percentages
total_count = AQI_categories['count'].sum()
AQI_categories['percentage'] = (AQI_categories['count'] / total_count) * 100
AQI_categories
| _id | count | percentage | |
|---|---|---|---|
| 0 | Moderate | 9087 | 39.448665 |
| 1 | Unhealthy for Sensitive Groups | 1568 | 6.807033 |
| 2 | Unhealthy | 2215 | 9.615802 |
| 3 | Hazardous | 191 | 0.829173 |
| 4 | Good | 9688 | 42.057738 |
| 5 | Very Unhealthy | 286 | 1.241589 |
# The distribution of the CO category in all cities worldwide using pie chart
pipeline = [{"$group": {"_id": "$COcategory", "count": {"$sum": 1}}}]
CO_categories = pd.DataFrame(list(collection.aggregate(pipeline)))
fig = px.pie(CO_categories,
values = 'count',
names = '_id',
color_discrete_sequence = px.colors.qualitative.Plotly)
fig.update_layout(
title = '<b>Distribution of CO Categories Worldwide<b>',
title_x = 0.15,
font = dict(size = 12)
)
fig.show()
# Print the distribution of the CO category in percentages
total_count = CO_categories['count'].sum()
CO_categories['percentage'] = (CO_categories['count'] / total_count) * 100
CO_categories
| _id | count | percentage | |
|---|---|---|---|
| 0 | Unhealthy for Sensitive Groups | 1 | 0.004341 |
| 1 | Moderate | 2 | 0.008682 |
| 2 | Good | 23032 | 99.986976 |
# The distribution of the Ozone category in all cities worldwide using pie chart
pipeline = [{"$group": {"_id": "$OZONEcategory", "count": {"$sum": 1}}}]
OZONE_categories = pd.DataFrame(list(collection.aggregate(pipeline)))
fig = px.pie(OZONE_categories,
values = 'count',
names = '_id',
color_discrete_sequence = px.colors.qualitative.Plotly)
fig.update_layout(
title = '<b>Distribution of OZONE Categories Worldwide<b>',
title_x = 0.15,
font = dict(size = 12))
fig.show()
# Print the distribution of the OZONE category in percentages
total_count = OZONE_categories['count'].sum()
OZONE_categories['percentage'] = (OZONE_categories['count'] / total_count) * 100
OZONE_categories
| _id | count | percentage | |
|---|---|---|---|
| 0 | Moderate | 1419 | 6.160191 |
| 1 | Unhealthy for Sensitive Groups | 488 | 2.118515 |
| 2 | Unhealthy | 404 | 1.753853 |
| 3 | Good | 20672 | 89.741697 |
| 4 | Very Unhealthy | 52 | 0.225743 |
# The distribution of the NO2 category in all cities worldwide using pie chart
pipeline = [{"$group": {"_id": "$NO2category", "count": {"$sum": 1}}}]
NO2_categories = pd.DataFrame(list(collection.aggregate(pipeline)))
fig = px.pie(NO2_categories,
values = 'count',
names = '_id',
color_discrete_sequence = px.colors.qualitative.Plotly)
fig.update_layout(
title = '<b>Distribution of NO2 Categories Worldwide<b>',
title_x = 0.15,
font = dict(size = 12)
)
fig.show()
# Print the distribution of the NO2 category in percentages
total_count = NO2_categories['count'].sum()
NO2_categories['percentage'] = (NO2_categories['count'] / total_count) * 100
NO2_categories
| _id | count | percentage | |
|---|---|---|---|
| 0 | Moderate | 15 | 0.065118 |
| 1 | Good | 23020 | 99.934882 |
# The distribution of the PM2.5 category in all cities worldwide using pie chart
pipeline = [{"$group": {"_id": "$PM2-5category", "count": {"$sum": 1}}}]
PM2_5_categories = pd.DataFrame(list(collection.aggregate(pipeline)))
fig = px.pie(PM2_5_categories,
values = 'count',
names = '_id',
color_discrete_sequence = px.colors.qualitative.Plotly)
fig.update_layout(
title = '<b>Distribution of PM2.5 Categories Worldwide<b>',
title_x = 0.15,
font = dict(size = 12)
)
fig.show()
# Print the distribution of the PM2.5 category in percentages
total_count = PM2_5_categories['count'].sum()
PM2_5_categories['percentage'] = (PM2_5_categories['count'] / total_count) * 100
PM2_5_categories
| _id | count | percentage | |
|---|---|---|---|
| 0 | Good | 9950 | 43.195138 |
| 1 | Very Unhealthy | 255 | 1.107011 |
| 2 | Moderate | 8939 | 38.806165 |
| 3 | Unhealthy for Sensitive Groups | 1601 | 6.950293 |
| 4 | Unhealthy | 2118 | 9.194704 |
| 5 | Hazardous | 172 | 0.746690 |
# Find the top 25 countries by average and SD of AQI value
pipeline = [{'$group': {'_id': '$Country',
'avg_AQIvalue': {'$avg': '$AQI Value'},
'stddev_AQIvalue': {'$stdDevPop': '$AQI Value'}}},
{'$sort': {'avg_AQIvalue': -1}},
{'$limit': 25}]
top25_avg_AQIvalue = list(collection.aggregate(pipeline))
top25_avg_AQI = pd.DataFrame(top25_avg_AQIvalue)
top25_avg_AQI
| _id | avg_AQIvalue | stddev_AQIvalue | |
|---|---|---|---|
| 0 | Republic of Korea | 421.000000 | 0.000000 |
| 1 | Bahrain | 188.000000 | 0.000000 |
| 2 | Mauritania | 179.000000 | 26.353368 |
| 3 | Pakistan | 178.788274 | 61.108790 |
| 4 | United Arab Emirates | 163.666667 | 8.259674 |
| 5 | Aruba | 163.000000 | 0.000000 |
| 6 | Kuwait | 162.000000 | 1.414214 |
| 7 | Qatar | 157.500000 | 6.500000 |
| 8 | India | 152.964228 | 94.298932 |
| 9 | Senegal | 152.424242 | 31.411280 |
| 10 | Saudi Arabia | 149.285714 | 34.917103 |
| 11 | Gambia | 147.000000 | 15.762121 |
| 12 | Yemen | 144.571429 | 27.943091 |
| 13 | Guinea-Bissau | 138.750000 | 33.394423 |
| 14 | Oman | 137.500000 | 26.856098 |
| 15 | China | 126.953459 | 51.195731 |
| 16 | Kingdom of Eswatini | 119.333333 | 38.560198 |
| 17 | Uzbekistan | 119.086207 | 40.591865 |
| 18 | Nepal | 116.363636 | 49.931965 |
| 19 | Tajikistan | 115.206897 | 24.803197 |
| 20 | Democratic Republic of the Congo | 114.814286 | 88.150244 |
| 21 | Iraq | 114.071429 | 30.918012 |
| 22 | Bangladesh | 113.462500 | 49.621554 |
| 23 | South Africa | 113.040230 | 72.013677 |
| 24 | Iran (Islamic Republic of) | 108.296089 | 42.221636 |
# Print Top 25 countries with highest average AQI value
Top_25_countries = [doc['_id'] for doc in top25_avg_AQIvalue]
Top_25_countries
['Republic of Korea', 'Bahrain', 'Mauritania', 'Pakistan', 'United Arab Emirates', 'Aruba', 'Kuwait', 'Qatar', 'India', 'Senegal', 'Saudi Arabia', 'Gambia', 'Yemen', 'Guinea-Bissau', 'Oman', 'China', 'Kingdom of Eswatini', 'Uzbekistan', 'Nepal', 'Tajikistan', 'Democratic Republic of the Congo', 'Iraq', 'Bangladesh', 'South Africa', 'Iran (Islamic Republic of)']
# Get the information for the top 25 countries
df = pd.DataFrame(list(collection.find({'Country': {'$in': [c['_id'] for c in top25_avg_AQIvalue]}},
{'_id': 0,
'Country': 1,
'City': 1,
'AQI Value': 1,
'AQI Category': 1})))
# Plot a treemap using data from Top 25 countries with highest average AQI value
fig = px.treemap(df, path = ['Country', 'City'],
values = 'AQI Value',
color = 'AQI Value',
hover_data = ['AQI Value', 'AQI Category'],
color_continuous_scale = 'Jet')
fig.update_layout(title = {
'text': '<b>Top 25 countries with highest average AQI value<b>',
'x': 0.5,
'y': 0.95},
width = 1000, height = 800)
fig.show()
# Find the top 25 countries by average and SD of PM2.5 value
pipeline = [{'$group': {'_id': '$Country',
'avg_PM2-5value': {'$avg': '$PM2-5value'},
'stddev_PM2-5value': {'$stdDevPop': '$PM2-5value'}}},
{'$sort': {'avg_PM2-5value': -1}},
{'$limit': 25}]
top25_avg_PM2value = list(collection.aggregate(pipeline))
# Create the stacked bar chart using 'AQI Category' in Top 25 Countries with highest average AQI value
aqi_categories = ['Good', 'Moderate', 'Unhealthy for Sensitive Groups', 'Unhealthy', 'Very Unhealthy', 'Hazardous']
# Group and pivot the data
grouped_data = df.groupby(['Country', 'AQI Category']).size().reset_index(name = 'count')
pivot_data = grouped_data.pivot(index = 'Country', columns = 'AQI Category', values = 'count').fillna(0)
pivot_data
| AQI Category | Good | Hazardous | Moderate | Unhealthy | Unhealthy for Sensitive Groups | Very Unhealthy |
|---|---|---|---|---|---|---|
| Country | ||||||
| Aruba | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| Bahrain | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 |
| Bangladesh | 4.0 | 0.0 | 36.0 | 26.0 | 12.0 | 2.0 |
| China | 57.0 | 3.0 | 213.0 | 294.0 | 183.0 | 45.0 |
| Democratic Republic of the Congo | 9.0 | 3.0 | 38.0 | 8.0 | 6.0 | 6.0 |
| Gambia | 0.0 | 0.0 | 0.0 | 7.0 | 2.0 | 0.0 |
| Guinea-Bissau | 0.0 | 0.0 | 0.0 | 1.0 | 3.0 | 0.0 |
| India | 130.0 | 158.0 | 585.0 | 1101.0 | 383.0 | 131.0 |
| Iran (Islamic Republic of) | 14.0 | 0.0 | 72.0 | 39.0 | 50.0 | 4.0 |
| Iraq | 0.0 | 0.0 | 4.0 | 2.0 | 8.0 | 0.0 |
| Kingdom of Eswatini | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 | 0.0 |
| Kuwait | 0.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 |
| Mauritania | 0.0 | 0.0 | 0.0 | 3.0 | 0.0 | 1.0 |
| Nepal | 1.0 | 0.0 | 12.0 | 6.0 | 11.0 | 3.0 |
| Oman | 0.0 | 0.0 | 1.0 | 4.0 | 1.0 | 0.0 |
| Pakistan | 0.0 | 13.0 | 10.0 | 229.0 | 31.0 | 24.0 |
| Qatar | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 |
| Republic of Korea | 0.0 | 1.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| Saudi Arabia | 0.0 | 0.0 | 3.0 | 9.0 | 2.0 | 0.0 |
| Senegal | 1.0 | 0.0 | 0.0 | 23.0 | 8.0 | 1.0 |
| South Africa | 28.0 | 3.0 | 70.0 | 36.0 | 22.0 | 15.0 |
| Tajikistan | 0.0 | 0.0 | 9.0 | 3.0 | 17.0 | 0.0 |
| United Arab Emirates | 0.0 | 0.0 | 0.0 | 3.0 | 0.0 | 0.0 |
| Uzbekistan | 0.0 | 1.0 | 21.0 | 13.0 | 23.0 | 0.0 |
| Yemen | 0.0 | 0.0 | 1.0 | 4.0 | 2.0 | 0.0 |
# Create the stacked bar chart
fig = px.bar(pivot_data,
x = pivot_data.index,
y = aqi_categories,
title = '<b>AQI Categories in Top 25 Countries with highest average AQI value<b>',
color_discrete_sequence = px.colors.qualitative.Set3,
category_orders = {'AQI Category': aqi_categories})
fig.update_layout(barmode = 'stack',
xaxis_title = '<b>Country<b>',
yaxis_title = '<b>Number of Cities<b>',
legend_title = '<b>AQI Category<b>',
margin = dict(t = 50, l = 25, r = 25, b = 25))
fig.show()
# Create the stacked bar chart using 'AQI Category' in percentages of Top 25 Countries with highest average AQI value
# Group and pivot the data
grouped_data = df.groupby(['Country', 'AQI Category']).size().reset_index(name = 'count')
grouped_data['AQI Category'] = pd.Categorical(grouped_data['AQI Category'], categories = aqi_categories, ordered=True)
pivot_data = grouped_data.pivot(index = 'Country', columns = 'AQI Category', values = 'count').fillna(0)
# Convert counts to percentages
pivot_data_percentage = pivot_data.apply(lambda x: x / x.sum() * 100, axis = 1)
pivot_data_percentage
| AQI Category | Good | Moderate | Unhealthy for Sensitive Groups | Unhealthy | Very Unhealthy | Hazardous |
|---|---|---|---|---|---|---|
| Country | ||||||
| Aruba | 0.000000 | 0.000000 | 0.000000 | 100.000000 | 0.000000 | 0.000000 |
| Bahrain | 0.000000 | 0.000000 | 0.000000 | 100.000000 | 0.000000 | 0.000000 |
| Bangladesh | 5.000000 | 45.000000 | 15.000000 | 32.500000 | 2.500000 | 0.000000 |
| China | 7.169811 | 26.792453 | 23.018868 | 36.981132 | 5.660377 | 0.377358 |
| Democratic Republic of the Congo | 12.857143 | 54.285714 | 8.571429 | 11.428571 | 8.571429 | 4.285714 |
| Gambia | 0.000000 | 0.000000 | 22.222222 | 77.777778 | 0.000000 | 0.000000 |
| Guinea-Bissau | 0.000000 | 0.000000 | 75.000000 | 25.000000 | 0.000000 | 0.000000 |
| India | 5.225080 | 23.512862 | 15.393891 | 44.252412 | 5.265273 | 6.350482 |
| Iran (Islamic Republic of) | 7.821229 | 40.223464 | 27.932961 | 21.787709 | 2.234637 | 0.000000 |
| Iraq | 0.000000 | 28.571429 | 57.142857 | 14.285714 | 0.000000 | 0.000000 |
| Kingdom of Eswatini | 0.000000 | 33.333333 | 33.333333 | 33.333333 | 0.000000 | 0.000000 |
| Kuwait | 0.000000 | 0.000000 | 0.000000 | 100.000000 | 0.000000 | 0.000000 |
| Mauritania | 0.000000 | 0.000000 | 0.000000 | 75.000000 | 25.000000 | 0.000000 |
| Nepal | 3.030303 | 36.363636 | 33.333333 | 18.181818 | 9.090909 | 0.000000 |
| Oman | 0.000000 | 16.666667 | 16.666667 | 66.666667 | 0.000000 | 0.000000 |
| Pakistan | 0.000000 | 3.257329 | 10.097720 | 74.592834 | 7.817590 | 4.234528 |
| Qatar | 0.000000 | 0.000000 | 0.000000 | 100.000000 | 0.000000 | 0.000000 |
| Republic of Korea | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 100.000000 |
| Saudi Arabia | 0.000000 | 21.428571 | 14.285714 | 64.285714 | 0.000000 | 0.000000 |
| Senegal | 3.030303 | 0.000000 | 24.242424 | 69.696970 | 3.030303 | 0.000000 |
| South Africa | 16.091954 | 40.229885 | 12.643678 | 20.689655 | 8.620690 | 1.724138 |
| Tajikistan | 0.000000 | 31.034483 | 58.620690 | 10.344828 | 0.000000 | 0.000000 |
| United Arab Emirates | 0.000000 | 0.000000 | 0.000000 | 100.000000 | 0.000000 | 0.000000 |
| Uzbekistan | 0.000000 | 36.206897 | 39.655172 | 22.413793 | 0.000000 | 1.724138 |
| Yemen | 0.000000 | 14.285714 | 28.571429 | 57.142857 | 0.000000 | 0.000000 |
# Create the stacked bar chart
fig = px.bar(pivot_data_percentage,
x = pivot_data_percentage.index,
y = aqi_categories,
title = '<b>Percentages of AQI Categories in Top 25 Countries <b>',
color_discrete_sequence = px.colors.qualitative.Set3)
fig.update_layout(barmode = 'stack',
xaxis_title = '<b>Country<b>',
yaxis_title = '<b>% of Cities in each AQI Categories<b>',
legend_title = '<b>AQI Categories<b>',
margin = dict(t = 50, l = 25, r = 25, b = 25),
plot_bgcolor='white')
fig.show()
# Calculate sum of percentage of cities with poor air quality in each country
poor_air_quality = ['Unhealthy for Sensitive Groups', 'Unhealthy', 'Very Unhealthy', 'Hazardous']
sum_poor_air_quality = pivot_data_percentage[poor_air_quality].sum(axis = 1)
sum_poor_air_quality_sorted = sum_poor_air_quality.sort_values(ascending = False)
sum_poor_air_quality_sorted
Country Aruba 100.000000 Kuwait 100.000000 United Arab Emirates 100.000000 Republic of Korea 100.000000 Qatar 100.000000 Bahrain 100.000000 Mauritania 100.000000 Guinea-Bissau 100.000000 Gambia 100.000000 Senegal 96.969697 Pakistan 96.742671 Yemen 85.714286 Oman 83.333333 Saudi Arabia 78.571429 Iraq 71.428571 India 71.262058 Tajikistan 68.965517 Kingdom of Eswatini 66.666667 China 66.037736 Uzbekistan 63.793103 Nepal 60.606061 Iran (Islamic Republic of) 51.955307 Bangladesh 50.000000 South Africa 43.678161 Democratic Republic of the Congo 32.857143 dtype: float64
# Correlation between AQI and key air pollutants
df = pd.DataFrame(list(collection.find({},
{'_id': 0, 'AQI Value': 1,
'COvalue ': 1, 'OZONEvalue': 1,
'NO2value': 1, 'PM2-5value': 1})))
# Find and print correlation matrix
corr_matrix = df.corr().round(3)
corr_matrix
| AQI Value | COvalue | OZONEvalue | NO2value | PM2-5value | |
|---|---|---|---|---|---|
| AQI Value | 1.000 | 0.430 | 0.405 | 0.231 | 0.985 |
| COvalue | 0.430 | 1.000 | 0.145 | 0.488 | 0.438 |
| OZONEvalue | 0.405 | 0.145 | 1.000 | -0.183 | 0.340 |
| NO2value | 0.231 | 0.488 | -0.183 | 1.000 | 0.259 |
| PM2-5value | 0.985 | 0.438 | 0.340 | 0.259 | 1.000 |
# Plot correlogram
fig = px.imshow(corr_matrix,
text_auto = True,
color_continuous_scale = 'Viridis',
labels = dict(color = "<b>Correlation<b>"))
fig.update_layout(
title = '<b>Correlation between AQI and air pollutants<b>',
title_x = 0.21)
fig.update_xaxes(side = "top")
fig.show()